How to fit a Logistic Regression using Bayesian Methods
The advantage of using Bayes to overcome the Separation in Logistic
Regression
mydf <- read.csv(file = 'data/HtWtData300.csv')
head(mydf) male height weight
1 0 64.0 136.4
2 0 62.3 215.1
3 1 67.9 173.6
4 0 64.2 117.3
5 0 64.8 123.3
6 0 57.5 96.5
Probability close to 0.5
table(mydf$male)
0 1
152 148
prop.table(table(mydf$male))
0 1
0.5066667 0.4933333
Firstly, we try to fit with glm
fmod1 <- glm(male ~ height + weight,
family = "binomial", data = mydf)
summary(fmod1)
Call:
glm(formula = male ~ height + weight, family = "binomial", data = mydf)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.7751 -0.4674 -0.0165 0.3655 3.1906
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -59.994880 7.136907 -8.406 < 2e-16 ***
height 0.890858 0.108873 8.183 2.78e-16 ***
weight 0.005320 0.005225 1.018 0.309
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 415.83 on 299 degrees of freedom
Residual deviance: 189.49 on 297 degrees of freedom
AIC: 195.49
Number of Fisher Scoring iterations: 6
\(\Rightarrow\) We will see the estimates as same as the Bayesian method.
plot(mydf$height,
mydf$weight,pch=16, col="blue")
points(mydf$height[mydf$male==1],
mydf$weight[mydf$male==1],
col="orange", pch=16)
legend("topleft", legend=c("Female","Male"),
col=c("blue","orange"), pch=16)Model for Dichotomous y with multiple metric predictors set up in
Stan
But firstly, we look at the simplified modelString of
Stan:
# Not eval
modelString <- "
data{
int<lower=0> N;
real xc[N];
int xb[N];
int<lower=0,upper=1> y[N]; //Binary outcome
}
parameters{
real b0;
real b1;
real b2;
}
model{
b0 ~ normal(0,1);
b1 ~ normal(0,1);
b2 ~ normal(0,1);
y ~ bernoulli_logit(b0 + b1*xc + b2*xb);
}
"Then move on the destination one:
modelString <- "
data {
int<lower=1> Ntotal; // num of observations
int<lower=1> Nx; // num of predictors
int<lower=0, upper=1> y[Ntotal];
matrix[Ntotal, Nx] x;
}
transformed data {
vector[Nx] meanX;
vector[Nx] sdX;
matrix[Ntotal, Nx] zx; // normalized
for ( j in 1:Nx ) {
meanX[j] = mean(x[,j]);
sdX[j] = sd(x[,j]);
for (i in 1:Ntotal) {
zx[i,j] = (x[i,j] - meanX[j]) / sdX[j];
}
}
}
parameters {
real zbeta0;
vector[Nx] zbeta;
}
transformed parameters{
vector[Ntotal] mu;
mu = zbeta0 + zx * zbeta; // matrix product
}
model {
zbeta0 ~ normal(0, 2);
zbeta ~ normal(0, 2);
y ~ bernoulli_logit(mu);
}
generated quantities {
// Transform to original scale:
real beta0;
vector[Nx] beta;
// .* and ./ are element-wise product and division
beta0 = zbeta0 - sum(zbeta .* meanX ./ sdX);
beta = zbeta ./ sdX;
}
"stanDsoLogistic <- stan_model(model_code=modelString)
# save(stanDsoLogistic, file = "data/stanLogisticDso.Rdata")load(file = "data/stanLogisticDso.Rdata")Fit model
heightWeightDataList <- list(Ntotal = nrow(mydf),
y = mydf$male,
x = cbind(mydf$height, mydf$weight),
Nx = 2)fit <- sampling(stanDsoLogistic,
data = heightWeightDataList,
pars = c('beta0', 'beta'),
iter = 5000, chains = 2, cores = 2
)Analyze fitted model using shinystan
launch_shinystan(fit)Analyze parameters.
stan_ac(fit, separate_chains = T)pairs(fit)plot(fit)plot(fit,pars=c("beta"))plot(fit,pars=c("beta[2]"))summary(fit)$summary[,c(1,4,8)] mean 2.5% 97.5%
beta0 -59.074628416 -7.332501e+01 -46.74736282
beta[1] 0.876102572 6.866473e-01 1.08922356
beta[2] 0.005720759 -4.493517e-03 0.01585729
lp__ -97.682840004 -1.008467e+02 -96.28774967
Parameter \(\beta_2\) is not significant with 95% HDI.
stan_dens(fit)estimBetas<-summary(fit)$summary[1:3,1]Plot the data with separating hyperplane.
plot(mydf$height, mydf$weight, pch=16, col="blue")
points(mydf$height[mydf$male==1], mydf$weight[mydf$male==1], col = "orange", pch = 16)
lines(mydf$height, -(estimBetas[1]+estimBetas[2]*mydf$height)/estimBetas[3])
legend("topleft", legend = c("Female","Male"), col = c("blue","orange"), pch=16)Now try to remove almost all males from the sample to see what may happen when there are few 1’s observed.
In the original sample the proportion of males is:
mean(mydf$male)[1] 0.4933333
Sample with females is
females <- mydf[mydf$male == 0,]Select first 15 males.
males <- mydf[mydf$male == 1,][1:15,] # just 15 males (originally was ~150)
mydf_sparse <- rbind(males,females)
rownames(mydf_sparse) <- NULL
head(mydf_sparse, 20) male height weight
1 1 67.9 173.6
2 1 70.2 191.1
3 1 71.1 193.9
4 1 66.5 127.1
5 1 75.1 204.4
6 1 64.6 143.4
7 1 69.2 124.4
8 1 68.1 140.9
9 1 72.6 164.7
10 1 71.5 193.6
11 1 76.0 180.0
12 1 69.7 155.0
13 1 73.3 188.2
14 1 68.3 178.6
15 1 70.8 207.3
16 0 64.0 136.4
17 0 62.3 215.1
18 0 64.2 117.3
19 0 64.8 123.3
20 0 57.5 96.5
Fit sparse model
fmod2 <- glm(male ~ height + weight, family = "binomial", data = mydf_sparse)
summary(fmod2)
Call:
glm(formula = male ~ height + weight, family = "binomial", data = mydf_sparse)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.28353 -0.17117 -0.06829 -0.01696 3.15051
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -84.24302 20.03395 -4.205 2.61e-05 ***
height 1.25377 0.30728 4.080 4.50e-05 ***
weight -0.01190 0.01248 -0.953 0.34
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 100.909 on 166 degrees of freedom
Residual deviance: 35.582 on 164 degrees of freedom
AIC: 41.582
Number of Fisher Scoring iterations: 8
heightWeightSparseDataList<-list(Ntotal=nrow(mydf_sparse),
y=mydf_sparse$male,
x=cbind(mydf_sparse$height, mydf_sparse$weight),
Nx=2)fit_sparse <- sampling(stanDsoLogistic,
data=heightWeightSparseDataList,
pars=c('beta0', 'beta'),
iter=5000, chains = 2, cores = 2
)stan_ac(fit_sparse, separate_chains = T)pairs(fit_sparse)plot(fit_sparse)plot(fit_sparse,pars=c("beta"))plot(fit_sparse,pars=c("beta[2]"))Compare summary of the two studies
rbind(beta0reg=summary(fit)$summary[1,c(1,3)],
beta0glm=c(summary(fmod1)$coefficients[1,1],summary(fmod1)$coefficients[1,2]*sqrt(dim(mydf)[1])),
beta0sparce=summary(fit_sparse)$summary[1,c(1,3)],
beta0sparceglm=c(summary(fmod2)$coefficients[1,1],summary(fmod2)$coefficients[1,2]*sqrt(dim(mydf_sparse)[1]))) mean sd
beta0reg -59.07463 6.749991
beta0glm -59.99488 123.614858
beta0sparce -65.48010 12.250007
beta0sparceglm -84.24302 258.895713
rbind(beta1reg=summary(fit)$summary[2,c(1,3)],
beta1glm=c(summary(fmod1)$coefficients[2,1],summary(fmod1)$coefficients[2,2]*sqrt(dim(mydf)[1])),
beta1sparce=summary(fit_sparse)$summary[2,c(1,3)],
beta1sparceglm=c(summary(fmod2)$coefficients[2,1],summary(fmod2)$coefficients[2,2]*sqrt(dim(mydf_sparse)[1]))) mean sd
beta1reg 0.8761026 0.1030361
beta1glm 0.8908578 1.8857272
beta1sparce 0.9664459 0.1882613
beta1sparceglm 1.2537728 3.9708903
rbind(beta2reg=summary(fit)$summary[3,c(1,3)],
beta2glm=c(summary(fmod1)$coefficients[3,1],summary(fmod1)$coefficients[3,2]*sqrt(dim(mydf)[1])),
beta2sparce=summary(fit_sparse)$summary[3,c(1,3)],
beta2sparceglm=c(summary(fmod2)$coefficients[3,1],summary(fmod2)$coefficients[3,2]*sqrt(dim(mydf_sparse)[1]))) mean sd
beta2reg 0.005720759 0.005286843
beta2glm 0.005319759 0.090499569
beta2sparce -0.007041950 0.010449550
beta2sparceglm -0.011900490 0.161304086
rbind(beta0reg=summary(fit)$summary[1,c(4,8)],
beta0sparce=summary(fit_sparse)$summary[1,c(4,8)]) 2.5% 97.5%
beta0reg -73.32501 -46.74736
beta0sparce -90.86386 -43.48094
rbind(beta1reg=summary(fit)$summary[2,c(4,8)],
beta1sparce=summary(fit_sparse)$summary[2,c(4,8)]) 2.5% 97.5%
beta1reg 0.6866473 1.089224
beta1sparce 0.6268317 1.359368
rbind(beta2reg=summary(fit)$summary[3,c(4,8)],
beta2sparce=summary(fit_sparse)$summary[3,c(4,8)]) 2.5% 97.5%
beta2reg -0.004493517 0.01585729
beta2sparce -0.028395094 0.01216875
HDI of both slopes widened significantly in the sample with
more extreme disproportion.
Standard deviations of betas also increase dramatically,
especially fit with glm.
Observe the data of the previous section.
Plot male (1) and female (0) groups with respect to weight.
plot(mydf$weight,mydf$male)In the lower right corner there are some outliers representing heavy
females.
Such observations cause bias of model parameters.
plot(mydf$height,mydf$male)On the plot relative to height outliers are short males.
What does a model robust to such outliers look like?
Robust logistic regression is a mix of “guessing model” and logistic model
\[\mu = \alpha \frac{1}{2} + (1 - \alpha) \ \text{logistic}\Big(\beta_0 + \sum_j \beta_j x_j\Big)\]
When \(\alpha = 0\) all observations fit within logistic regression pattern, i.e. the classes are separated well enough to be explained by logistic sigmoid function. In this case the model becomes pure logistic regression.
When \(\alpha = 1\) the classes overlap completely and the model can only guess with probability 0.5 to which class the observation belongs.
Typically \(\alpha\) is small (few
outliers showing that predictor points to wrong class).
Select beta distribution as a prior to \(\alpha\) with high concentration near zero:
dbeta(1,9).
Argument <- seq(from=0,to=1,by=.01)
plot(Argument, dbeta(Argument, 1, 9), type="l")The modified model is on the .
modelString=
"data { // ROBUST LOGISTIC REGRESSION
int<lower=1> Ntotal; // num of observations
int<lower=1> Nx; // num of predictors
int<lower=0, upper=1> y[Ntotal];
matrix[Ntotal, Nx] x;
}
transformed data {
vector[Nx] meanX;
vector[Nx] sdX;
matrix[Ntotal, Nx] zx; // normalized
for ( j in 1:Nx ) {
meanX[j] = mean(x[,j]);
sdX[j] = sd(x[,j]);
for ( i in 1:Ntotal ) {
zx[i,j] = ( x[i,j] - meanX[j] ) / sdX[j];
}
}
}
parameters {
real zbeta0;
vector[Nx] zbeta;
real<lower=0,upper=1> guess; // mixture param
}
transformed parameters{
vector[Ntotal] mu;
for ( i in 1:Ntotal ) {
mu[i] = guess * (1/2.0) + (1-guess) * inv_logit(zbeta0 + zx[i,] * zbeta);
}
}
model {
zbeta0 ~ normal(0, 2);
zbeta ~ normal(0, 2);
guess ~ beta(1, 9);
y ~ bernoulli(mu);
}
generated quantities {
// Transform to original scale:
real beta0;
vector[Nx] beta;
// .* and ./ are element-wise product and division
beta0 = zbeta0 - sum( zbeta .* meanX ./ sdX );
beta = zbeta ./ sdX;
}
"stanDsoRobustLogistic <- stan_model(model_code=modelString)# save(stanDsoRobustLogistic, file = "data/stanRobustLogisticDso.Rds")
load("data/stanRobustLogisticDso.Rds")Run robust MCMC with the hight/weight data.
fitRobust <- sampling(stanDsoRobustLogistic,
data=heightWeightDataList,
pars=c('beta0', 'beta', 'guess'),
iter=5000, chains = 2, cores = 2)Analyze results.
stan_ac(fitRobust, separate_chains = T)pairs(fitRobust)plot(fitRobust)plot(fitRobust,pars=c("beta[1]"))plot(fitRobust,pars=c("beta[2]"))plot(fitRobust,pars=c("guess"))rbind(summary(fitRobust)$summary[,c(1,4,7)],
summary(fit)$summary[,c(1,4,8)]
) mean 2.5% 75%
beta0 -6.759671e+01 -9.033603e+01 -60.44830982
beta[1] 9.987992e-01 7.405794e-01 1.08941932
beta[2] 7.863251e-03 -4.898361e-03 0.01200465
guess 3.520801e-02 1.562728e-03 0.04978482
lp__ -1.020473e+02 -1.058665e+02 -100.91970991
beta0 -5.907463e+01 -7.332501e+01 -46.74736282
beta[1] 8.761026e-01 6.866473e-01 1.08922356
beta[2] 5.720759e-03 -4.493517e-03 0.01585729
lp__ -9.768284e+01 -1.008467e+02 -96.28774967
Note positive correlation between guess and the slope \(\beta_1\). The more outliers the higher is the slope.
Since \(\beta_2\) does not seem significant. Now, we fit both models with height as only predictor.
heightWeightDataList<-list(Ntotal=nrow(mydf),
y=mydf$male,
x=cbind(mydf$height),
Nx=1)
fit <- sampling(stanDsoLogistic,
data=heightWeightDataList,
pars=c('beta0', 'beta'),
iter=5000, chains = 2, cores = 2
)
fitRobust <- sampling(stanDsoRobustLogistic,
data=heightWeightDataList,
pars=c('beta0', 'beta', 'guess'),
iter=5000, chains = 2, cores = 2
)
pairs(fit)pairs(fitRobust)plot(fit)plot(fit,pars=c("beta"))plot(fitRobust)plot(fitRobust,pars=c("beta[1]","guess"))plot(fitRobust,pars=c("guess"))rbind(summary(fitRobust)$summary[,c(1,4,7)],
summary(fit)$summary[,c(1,4,8)]
) mean 2.5% 75%
beta0 -66.86054035 -8.942821e+01 -59.89807734
beta[1] 1.00658061 7.500421e-01 1.09452331
guess 0.03199073 1.072547e-03 0.04549365
lp__ -102.27095400 -1.058139e+02 -101.30279477
beta0 -59.24314304 -7.275107e+01 -47.12218747
beta[1] 0.89230096 7.081439e-01 1.09712292
lp__ -97.71023494 -1.002870e+02 -96.75923890
Compare probabilities predicted by logistic regression and robust logistic regression.
# Coefficients
meanBeta0Robust<-summary(fitRobust)$summary[1,1]
meanBeta1Robust<-summary(fitRobust)$summary[2,1]
guess<-summary(fitRobust)$summary[3,1]
meanBeta0<-summary(fit)$summary[1,1]
meanBeta1<-summary(fit)$summary[2,1]
#Linear predictors and probabilities
linPredRobust_Male.Height<-meanBeta0Robust+meanBeta1Robust*mydf$height
pRobustMail_height<-guess/2+(1-guess)*exp(linPredRobust_Male.Height)/(1+exp(linPredRobust_Male.Height))
linPred_Male.Height<-meanBeta0+meanBeta1*mydf$height
pMail_height<-exp(linPred_Male.Height)/(1+exp(linPred_Male.Height))
# Plot
plot(mydf$height,mydf$male,pch=16)
points(mydf$height,pRobustMail_height,col="orange",pch=16)
points(mydf$height,pMail_height,col="cyan",pch=16)
legend("topleft",
legend=c("Actual","Prob Logistic","Prob. Robust"),
col=c("black","cyan","orange"), pch=16)Logistic probability is more extreme: higher in the area of
low-height male observations because it is biased by an outlier of short
male below the average female height. But the far tails are closer to
0.
Robust logistic regression did not react to this observation as
much!!
Robust model does not produce probabilities too close to 0 and 1.
Repeat the same comparison with the sparse data.
Create data list with one predictor.
heightWeightSparseDataList <- list(Ntotal = nrow(mydf_sparse),
y = mydf_sparse$male,
x = cbind(mydf_sparse$height),
Nx = 1)Fit both models with only one predictor height
fitSparse <- sampling(stanDsoLogistic,
data=heightWeightSparseDataList,
pars=c('beta0', 'beta'),
iter=5000, chains = 2, cores = 2)
fitSparseRobust <- sampling(stanDsoRobustLogistic,
data=heightWeightSparseDataList,
pars=c('beta0', 'beta', 'guess'),
iter=5000, chains = 2, cores = 2)Analyze the models.
pairs(fitSparse)pairs(fitSparseRobust)plot(fitSparse)plot(fitSparse,pars=c("beta"))plot(fitSparseRobust)plot(fitSparseRobust,pars=c("beta[1]","guess"))plot(fitSparseRobust,pars=c("guess"))rbind(summary(fitSparseRobust)$summary[,c(1,4,7)],
summary(fitSparse)$summary[,c(1,4,8)]
) mean 2.5% 75%
beta0 -65.43711171 -9.393761e+01 -56.03873197
beta[1] 0.94912626 6.055210e-01 1.07111437
guess 0.01689194 5.152762e-04 0.02448201
lp__ -28.85479455 -3.232575e+01 -27.89492051
beta0 -63.02455548 -8.871818e+01 -42.69724337
beta[1] 0.91384987 6.114798e-01 1.29416346
lp__ -23.14591098 -2.575415e+01 -22.20307431
Make plot of probabilities.
# Coefficients
meanBeta0Robust<-summary(fitSparseRobust)$summary[1,1]
meanBeta1Robust<-summary(fitSparseRobust)$summary[2,1]
guess<-summary(fitSparseRobust)$summary[3,1]
meanBeta0<-summary(fitSparse)$summary[1,1]
meanBeta1<-summary(fitSparse)$summary[2,1]
#Linear predictors and probabilities
linPredRobust_Male.Height<-meanBeta0Robust+meanBeta1Robust*mydf_sparse$height
pRobustMail_height<-guess/2+(1-guess)*exp(linPredRobust_Male.Height)/(1+exp(linPredRobust_Male.Height))
linPred_Male.Height<-meanBeta0+meanBeta1*mydf_sparse$height
pMail_height<-exp(linPred_Male.Height)/(1+exp(linPred_Male.Height))
# Plot
plot(mydf_sparse$height,mydf_sparse$male,pch=16)
points(mydf_sparse$height,pRobustMail_height,col="orange",pch=16)
points(mydf_sparse$height,pMail_height,col="cyan",pch=16)
legend("topleft",
legend=c("Actual","Prob Logistic","Prob. Robust"),
col=c("black","cyan","orange"),pch=16)This data example is from library DAAG.
Thirty patients were given an anesthetic agent maintained at a
predetermined concentration level (conc) for 15 minutes
before making an incision. It was then noted whether the patient moved
(1), i.e. jerked or twisted.
library(DAAG)
head(anesthetic) move conc logconc nomove
1 0 1.0 0.0000000 1
2 1 1.2 0.1823216 0
3 0 1.4 0.3364722 1
4 1 1.4 0.3364722 0
5 1 1.2 0.1823216 0
6 0 2.5 0.9162907 1
Use column move as response and column
logconc as predictor.
Prepare the data.
dataListAnesthetic <- list(Ntotal=nrow(anesthetic),
y=anesthetic$move,
x=cbind(anesthetic$logconc),
Nx=1)glm()logRegr <- glm(move ~ logconc,
data=anesthetic,
family="binomial")
summary(logRegr)
Call:
glm(formula = move ~ logconc, family = "binomial", data = anesthetic)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.1477 -0.6962 -0.1209 0.7586 1.7528
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 0.8077 0.5709 1.415 0.15715
logconc -6.2457 2.2618 -2.761 0.00575 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 41.455 on 29 degrees of freedom
Residual deviance: 28.007 on 28 degrees of freedom
AIC: 32.007
Number of Fisher Scoring iterations: 5
predLogRegr <- predict(logRegr, type="response")Run MCMC using logistic and robust logistic models.
fitAnesth <- sampling(stanDsoLogistic,
data=dataListAnesthetic,
pars=c('beta0', 'beta'),
iter=5000, chains = 2, cores = 2
)
fitRobustAnesth <- sampling(stanDsoRobustLogistic,
data=dataListAnesthetic,
pars=c('beta0', 'beta', 'guess'),
iter=5000, chains = 2, cores = 2)Extract diagnostics
summary(fitAnesth)$summary[,c(1,4,8:10)] mean 2.5% 97.5% n_eff Rhat
beta0 0.8198095 -0.2232116 2.005003 3877.299 0.9998678
beta[1] -6.1749593 -10.8825706 -2.377365 3117.371 1.0000402
lp__ -15.4911255 -18.3350197 -14.471061 1721.835 1.0022634
stan_ac(fitAnesth, separate_chains = T)stan_trace(fitAnesth)pairs(fitAnesth,pars=c("beta0","beta"))plot(fitAnesth)summary(fitRobustAnesth)$summary[,c(1,4,8:10)] mean 2.5% 97.5% n_eff Rhat
beta0 1.02591748 -0.206024406 2.6568493 2882.397 1.0010848
beta[1] -7.40456341 -14.191363295 -2.7011459 2542.829 1.0008553
guess 0.09682537 0.003829574 0.2914329 4230.349 0.9996954
lp__ -19.49792255 -22.837487799 -18.0184837 1903.465 1.0013869
stan_ac(fitRobustAnesth, separate_chains = T)stan_trace(fitRobustAnesth)pairs(fitRobustAnesth,pars=c("beta0","beta","guess"))plot(fitRobustAnesth)plot(fitRobustAnesth,pars=c("guess"))Parameter guess is almost \(10%\).
This means that there should be a significant difference between the two
models.
However, 95% HDI almost covers \(0\).
Compare intercepts.
rbind(Logistic=summary(fitAnesth)$summary[1,c(1,4,8)],
Robust=summary(fitRobustAnesth)$summary[1,c(1,4,8)]) mean 2.5% 97.5%
Logistic 0.8198095 -0.2232116 2.005003
Robust 1.0259175 -0.2060244 2.656849
Compare slopes.
rbind(Logistic=summary(fitAnesth)$summary[2,c(1,4,8)],
Robust=summary(fitRobustAnesth)$summary[2,c(1,4,8)]) mean 2.5% 97.5%
Logistic -6.174959 -10.88257 -2.377365
Robust -7.404563 -14.19136 -2.701146
Compare probabilities.
# Coefficients
meanBeta0Robust<-summary(fitRobustAnesth)$summary[1,1]
meanBeta1Robust<-summary(fitRobustAnesth)$summary[2,1]
guess<-summary(fitRobustAnesth)$summary[3,1]
meanBeta0<-summary(fitAnesth)$summary[1,1]
meanBeta1<-summary(fitAnesth)$summary[2,1]
#Linear predictors and probabilities
linPredRobust_Move<-meanBeta0Robust+meanBeta1Robust*anesthetic$logconc
pRobustMove<-guess/2+(1-guess)*exp(linPredRobust_Move)/(1+exp(linPredRobust_Move))
linPred_Move<-meanBeta0+meanBeta1*anesthetic$logconc
pMove<-exp(linPred_Move)/(1+exp(linPred_Move))
# Plot
plot(anesthetic$logconc,anesthetic$move,pch=16)
points(anesthetic$logconc,pRobustMove,col="orange",pch=15)
points(anesthetic$logconc,pMove,col="cyan",pch=17)
points(anesthetic$logconc,predLogRegr,col="purple",pch=25)
legend("topright",
legend=c("Actual","Prob Logistic","Prob. Robust","Glm"),
col=c("black","cyan","orange","purple"),pch=c(16,17,15,25))Again, robust method does not return extreme probabilities.
Softmax (or multinomial logistic regression) is a generalization of logistic regression to the case where we want to handle multiple classes. In logistic regression, the response was binary: \(y^{(i)} \in \{0,1\}\), meanwhile softmax handles \(y^{(i)} \in \{1, ..., K\}\)
It is based on modification of odds ratio from
\[\frac{p}{1-p}\] to \[\frac{p_s}{p_r},\]
where \(p_r\) is probability of one of
several classes selected as reference (for example, control group).
The model changes very little from logistic regression and is shown
on this diagram.
Because adding constants to coefficients does not change softmax the reference class coefficients are assumed equal to zero.
The data SoftmaxRegData1.csv are from Kruschke, 2015
chapter 22.
myData <- read.csv(file="data/SoftmaxRegData2.csv" )
head(myData) X1 X2 Y
1 -0.08714736 -1.0813422 2
2 -0.72256565 -1.5838631 2
3 0.17918961 0.9717904 3
4 -1.15975176 0.5026244 3
5 -0.72711762 1.3757045 3
6 0.53341559 1.7746506 3
table(myData$Y, useNA = "always")
1 2 3 4 <NA>
58 145 139 133 0
idx2<-myData$Y==2
idx3<-myData$Y==3
idx4<-myData$Y==4
plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2],myData$X2[idx2],pch=16,col="orange")
points(myData$X1[idx3],myData$X2[idx3],pch=16,col="cyan")
points(myData$X1[idx4],myData$X2[idx4],pch=16,col="magenta")Prepare data list
dataListSoftmax<-list(N=nrow(myData), # num of observations
K=max(myData$Y), # num of groups
y=myData$Y,
x=cbind(x1 = myData$X1, x2 = myData$X2),
D=2) # num of predictiorsDescribe the model.
modelString="
data {
int<lower=2> K; // num of groups
int<lower=0> N; // num of observations
int<lower=1> D; // num of predictors
int<lower=1,upper=K> y[N];
matrix[N, D] x;
}
transformed data {
row_vector[D] zeros;
row_vector[D] x_m; // x means
row_vector[D] x_sd; // x standard deviations
matrix[N, D] zx; // normalized x
zeros = rep_row_vector(0, D); // coefficients are zeros for the baseline class
for (j in 1:D) {
x_m[j] = mean(x[,j]);
x_sd[j] = sd(x[,j]);
zx[,j] = (x[,j] - x_m[j]) / x_sd[j];
}
}
parameters {
matrix[K-1,D] zbeta_raw; // K-1 makes model identifiable
vector[K-1] zbeta0_raw;
}
transformed parameters {
vector[K] zbeta0; // intersection coeffs
matrix[K, D] zbeta; // predictor coeffs
zbeta0 = append_row(0, zbeta0_raw);
zbeta = append_row(zeros, zbeta_raw); // add zeros for coefficients of the baseclass
}
model {
zbeta0_raw ~ normal(0, 5);
for (k in 1:(K-1))
zbeta_raw[k,] ~ normal(0, 5);
for (n in 1:N)
y[n] ~ categorical(softmax(zbeta0 + zbeta * to_vector(zx[n,]) ));
}
generated quantities {
vector[K] beta0;
matrix[K, D] beta;
// transform zbetas to original betas:
for (k in 1:K) {
beta0[k] = zbeta0[k];
for (j in 1:D) {
beta0[k] = beta0[k] - zbeta[k,j] * x_m[j] / x_sd[j];
beta[k,j] = zbeta[k,j] / x_sd[j];
}
}
}
"Create DSO.
modelSoftmax <- stan_model(model_code=modelString)
save(modelSoftmax, file = "data/stanSoftmaxDso.Rds")load("data/stanSoftmaxDso.Rds")fit <- sampling(modelSoftmax,
data=dataListSoftmax,
pars=c('beta0', 'beta'),
iter=5000, chains = 2, cores = 2)Analyze fitted model using shinystan, but check directly
for purposely demonstration.
summary(fit)$summary[,c(1,4,8:10)] mean 2.5% 97.5% n_eff Rhat
beta0[1] 0.0000000 0.000000 0.0000000 NaN NaN
beta0[2] -3.8747761 -5.287726 -2.6498584 2424.806 1.0004543
beta0[3] -3.4383430 -4.758058 -2.2832007 3046.880 1.0003798
beta0[4] -3.8239785 -5.174666 -2.6048550 2777.099 1.0014128
beta[1,1] 0.0000000 0.000000 0.0000000 NaN NaN
beta[1,2] 0.0000000 0.000000 0.0000000 NaN NaN
beta[2,1] -5.5258762 -7.266858 -4.0493728 2509.455 1.0003697
beta[2,2] -5.3813396 -7.078252 -3.9099912 2562.798 1.0001610
beta[3,1] -1.4477120 -2.502389 -0.4137045 3499.414 1.0012289
beta[3,2] 8.7504970 6.700614 11.0491097 2771.076 0.9998857
beta[4,1] 8.1935589 6.190566 10.3753359 3002.059 1.0013696
beta[4,2] -0.7384386 -1.710137 0.1956379 3498.552 1.0011204
lp__ -123.5101370 -128.879940 -120.2345545 2018.045 1.0005849
The model has 4 classes and 2 predictors. The returned coefficients form a matrix \[\lambda_{i,1} = \beta_{0,1} + \beta{1,1} x_{i,1} + \beta_{2,1} x_{i,2} \\ \lambda_{i,2} = \beta_{0,2} + \beta{1,2} x_{i,1} + \beta_{2,2} x_{i,2} \\ \lambda_{i,3} = \beta_{0,3} + \beta{1,3} x_{i,1} + \beta_{2,31} x_{i,2} \\ \lambda_{i,4} = \beta_{0,4} + \beta{1,4} x_{i,1} + \beta_{2,4} x_{i,2}\]
In this system first class is selected as reference, so \(\beta_{0,1} = \beta_{1,1} = \beta_{2,1} = 0\).
stan_ac(fit, separate_chains = T)stan_trace(fit)pairs(fit,pars=c("beta0"))pairs(fit,pars=c("beta"))plot(fit)To predict classes use formula for probability of class \(k\) \[p_k = \frac{e^{\lambda_k}}{\sum_s e^{\lambda_s}}.\]
Create matrix of coefficients.
SoftmaxCoeff<-summary(fit)$summary[1:12,c(1)]
SoftmaxCoeff<-cbind(SoftmaxCoeff[1:4],matrix(SoftmaxCoeff[-(1:4)],ncol=2,byrow=T))
rownames(SoftmaxCoeff)<-paste0("Class",1:4)
SoftmaxCoeff [,1] [,2] [,3]
Class1 0.000000 0.000000 0.0000000
Class2 -3.874776 -5.525876 -5.3813396
Class3 -3.438343 -1.447712 8.7504970
Class4 -3.823979 8.193559 -0.7384386
Create linear predictors.
head(myData) X1 X2 Y
1 -0.08714736 -1.0813422 2
2 -0.72256565 -1.5838631 2
3 0.17918961 0.9717904 3
4 -1.15975176 0.5026244 3
5 -0.72711762 1.3757045 3
6 0.53341559 1.7746506 3
linPredictors<-apply(SoftmaxCoeff[,-1],1,function(z) z%*%t(myData[,1:2]))
dim(linPredictors)[1] 475 4
head(linPredictors) Class1 Class2 Class3 Class4
[1,] 0 6.300635 -9.336117 0.08445775
[2,] 0 12.516113 -12.813522 -4.75079860
[3,] 0 -6.219714 8.244234 0.75059308
[4,] 0 3.703852 6.077200 -9.87365165
[5,] 0 -3.385171 13.090755 -6.97355432
[6,] 0 -12.497586 14.756843 3.06010158
linPredictors<-t(apply(linPredictors,1,function(z) z+SoftmaxCoeff[,1]))
dim(linPredictors)[1] 475 4
head(linPredictors) Class1 Class2 Class3 Class4
[1,] 0 2.4258589 -12.774460 -3.7395208
[2,] 0 8.6413373 -16.251865 -8.5747771
[3,] 0 -10.0944901 4.805891 -3.0733854
[4,] 0 -0.1709239 2.638857 -13.6976302
[5,] 0 -7.2599471 9.652412 -10.7975328
[6,] 0 -16.3723623 11.318500 -0.7638769
Check calculation for the first row of the data and second class.
row1<-myData[1,]
Class2<-SoftmaxCoeff[2,1]+SoftmaxCoeff[2,2]*row1[1]+SoftmaxCoeff[2,3]*row1[2]
c(Class2,linPredictors[1,2])$X1
[1] 2.425859
$Class2
[1] 2.425859
Create probabilities
softmaxProb<-exp(linPredictors)/apply(exp(linPredictors),1,sum)
apply(head(softmaxProb),1,sum)[1] 1 1 1 1 1 1
Predict classes.
predClass<-apply(softmaxProb,1,which.max)
head(predClass)[1] 2 2 3 3 3 3
Plot predicted classes and compare them with the data.
idx2Pred<-predClass==2
idx3Pred<-predClass==3
idx4Pred<-predClass==4
par(mfrow=c(1,2))
plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2],myData$X2[idx2],pch=16,col="orange")
points(myData$X1[idx3],myData$X2[idx3],pch=16,col="cyan")
points(myData$X1[idx4],myData$X2[idx4],pch=16,col="magenta")
plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2Pred],myData$X2[idx2Pred],pch=16,col="orange")
points(myData$X1[idx3Pred],myData$X2[idx3Pred],pch=16,col="cyan")
points(myData$X1[idx4Pred],myData$X2[idx4Pred],pch=16,col="magenta")par(mfrow=c(1,1))See how different classes are separated by hyperplanes.
Add hyperplane between class 1 and class 2:
plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2Pred],myData$X2[idx2Pred],pch=16,col="orange")
points(myData$X1[idx3Pred],myData$X2[idx3Pred],pch=16,col="cyan")
points(myData$X1[idx4Pred],myData$X2[idx4Pred],pch=16,col="magenta")
lines(myData$X1,-(SoftmaxCoeff[2,1]+SoftmaxCoeff[2,2]*myData$X1)/SoftmaxCoeff[2,3],col="grey")Add hyperplane between class 1 and class 3.
plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2Pred],myData$X2[idx2Pred],pch=16,col="orange")
points(myData$X1[idx3Pred],myData$X2[idx3Pred],pch=16,col="cyan")
points(myData$X1[idx4Pred],myData$X2[idx4Pred],pch=16,col="magenta")
lines(myData$X1,-(SoftmaxCoeff[2,1]+SoftmaxCoeff[2,2]*myData$X1)/SoftmaxCoeff[2,3],col="grey")
lines(myData$X1,-(SoftmaxCoeff[3,1]+SoftmaxCoeff[3,2]*myData$X1)/SoftmaxCoeff[3,3],col="grey")Add hyperplane between class 1 and class 4.
plot(myData$X1,myData$X2,pch=16)
points(myData$X1[idx2Pred],myData$X2[idx2Pred],pch=16,col="orange")
points(myData$X1[idx3Pred],myData$X2[idx3Pred],pch=16,col="cyan")
points(myData$X1[idx4Pred],myData$X2[idx4Pred],pch=16,col="magenta")
lines(myData$X1,-(SoftmaxCoeff[2,1]+SoftmaxCoeff[2,2]*myData$X1)/SoftmaxCoeff[2,3],col="grey")
lines(myData$X1,-(SoftmaxCoeff[3,1]+SoftmaxCoeff[3,2]*myData$X1)/SoftmaxCoeff[3,3],col="grey")
lines(myData$X1,-(SoftmaxCoeff[4,1]+SoftmaxCoeff[4,2]*myData$X1)/SoftmaxCoeff[4,3],col="grey")If you see mistakes or want to suggest changes, please create an issue on the source repository.
Text and figures are licensed under Creative Commons Attribution CC BY 4.0. Source code is available at https://github.com/hai-mn/hai-mn.github.io, unless otherwise noted. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".
For attribution, please cite this work as
Nguyen (2022, March 20). HaiBiostat: Series 7 -- Fitting a Logistic Regression in Bayes. Retrieved from https://hai-mn.github.io/posts/2022-03-20-Bayesian methods - Series 7 of 10/
BibTeX citation
@misc{nguyen2022series,
author = {Nguyen, Hai},
title = {HaiBiostat: Series 7 -- Fitting a Logistic Regression in Bayes},
url = {https://hai-mn.github.io/posts/2022-03-20-Bayesian methods - Series 7 of 10/},
year = {2022}
}